#1 MVP We’ve looked at a few different ways in which we can build models this week, including how to prepare them properly. This weekend we’ll build a multiple linear regression model on a dataset which will need some preparation. The data has come from Kaggle and can be found in the data folder.

We want to model avocado sales. You’ll need to identify the target variable and use the tools we’ve worked with this week in order to prepare your dataset and find appropriate predictors. Once you’ve built your model use the validation techniques discussed on Wednesday to evaluate it.

#2 Extensions Build a decision tree to model the likelihood of a sale being of an organic avocado. Use k-means clustering to investigate potential relationships between the year and the average avocado price.

library(tidyverse)
#loading in data
avocado <- read.csv("data/avocado.csv")
# investigate structure and summary of data
str(avocado)
'data.frame':   18249 obs. of  14 variables:
 $ X           : int  0 1 2 3 4 5 6 7 8 9 ...
 $ Date        : Factor w/ 169 levels "2015-01-04","2015-01-11",..: 52 51 50 49 48 47 46 45 44 43 ...
 $ AveragePrice: num  1.33 1.35 0.93 1.08 1.28 1.26 0.99 0.98 1.02 1.07 ...
 $ Total.Volume: num  64237 54877 118220 78992 51040 ...
 $ X4046       : num  1037 674 795 1132 941 ...
 $ X4225       : num  54455 44639 109150 71976 43838 ...
 $ X4770       : num  48.2 58.3 130.5 72.6 75.8 ...
 $ Total.Bags  : num  8697 9506 8145 5811 6184 ...
 $ Small.Bags  : num  8604 9408 8042 5677 5986 ...
 $ Large.Bags  : num  93.2 97.5 103.1 133.8 197.7 ...
 $ XLarge.Bags : num  0 0 0 0 0 0 0 0 0 0 ...
 $ type        : Factor w/ 2 levels "conventional",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ year        : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
 $ region      : Factor w/ 54 levels "Albany","Atlanta",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(avocado)
       X                 Date        AveragePrice    Total.Volume     
 Min.   : 0.00   2015-01-04:  108   Min.   :0.440   Min.   :      85  
 1st Qu.:10.00   2015-01-11:  108   1st Qu.:1.100   1st Qu.:   10839  
 Median :24.00   2015-01-18:  108   Median :1.370   Median :  107377  
 Mean   :24.23   2015-01-25:  108   Mean   :1.406   Mean   :  850644  
 3rd Qu.:38.00   2015-02-01:  108   3rd Qu.:1.660   3rd Qu.:  432962  
 Max.   :52.00   2015-02-08:  108   Max.   :3.250   Max.   :62505647  
                 (Other)   :17601                                     
     X4046              X4225              X4770           Total.Bags      
 Min.   :       0   Min.   :       0   Min.   :      0   Min.   :       0  
 1st Qu.:     854   1st Qu.:    3009   1st Qu.:      0   1st Qu.:    5089  
 Median :    8645   Median :   29061   Median :    185   Median :   39744  
 Mean   :  293008   Mean   :  295155   Mean   :  22840   Mean   :  239639  
 3rd Qu.:  111020   3rd Qu.:  150207   3rd Qu.:   6243   3rd Qu.:  110783  
 Max.   :22743616   Max.   :20470573   Max.   :2546439   Max.   :19373134  
                                                                           
   Small.Bags         Large.Bags       XLarge.Bags                 type     
 Min.   :       0   Min.   :      0   Min.   :     0.0   conventional:9126  
 1st Qu.:    2849   1st Qu.:    127   1st Qu.:     0.0   organic     :9123  
 Median :   26363   Median :   2648   Median :     0.0                      
 Mean   :  182195   Mean   :  54338   Mean   :  3106.4                      
 3rd Qu.:   83338   3rd Qu.:  22029   3rd Qu.:   132.5                      
 Max.   :13384587   Max.   :5719097   Max.   :551693.7                      
                                                                            
      year                      region     
 Min.   :2015   Albany             :  338  
 1st Qu.:2015   Atlanta            :  338  
 Median :2016   BaltimoreWashington:  338  
 Mean   :2016   Boise              :  338  
 3rd Qu.:2017   Boston             :  338  
 Max.   :2018   BuffaloRochester   :  338  
                (Other)            :16221  
# check for missing values
apply(avocado, 2, function(x) any(is.na(x) | is.infinite(x) | is.null(x)))
           X         Date AveragePrice Total.Volume        X4046        X4225 
       FALSE        FALSE        FALSE        FALSE        FALSE        FALSE 
       X4770   Total.Bags   Small.Bags   Large.Bags  XLarge.Bags         type 
       FALSE        FALSE        FALSE        FALSE        FALSE        FALSE 
        year       region 
       FALSE        FALSE 
# get rid of spaces in column names
names(avocado) <- make.names(names(avocado))
# make all column names lower case
for( i in colnames(avocado)) {
  colnames(avocado)[which(colnames(avocado) == i)] = tolower(i)
}

avocado
library(lubridate)
avocado$year <- year(ymd(as.character(avocado$date)))
avocado$month <- month(ymd(as.character(avocado$date)))
avocado$week <- week(ymd(as.character(avocado$date)))
glimpse(avocado)
Observations: 18,249
Variables: 16
$ x            <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ date         <fct> 2015-12-27, 2015-12-20, 2015-12-13, 2015-12-06, 2015-11-29, 20…
$ averageprice <dbl> 1.33, 1.35, 0.93, 1.08, 1.28, 1.26, 0.99, 0.98, 1.02, 1.07, 1.…
$ total.volume <dbl> 64236.62, 54876.98, 118220.22, 78992.15, 51039.60, 55979.78, 8…
$ x4046        <dbl> 1036.74, 674.28, 794.70, 1132.00, 941.48, 1184.27, 1368.92, 70…
$ x4225        <dbl> 54454.85, 44638.81, 109149.67, 71976.41, 43838.39, 48067.99, 7…
$ x4770        <dbl> 48.16, 58.33, 130.50, 72.58, 75.78, 43.61, 93.26, 80.00, 85.34…
$ total.bags   <dbl> 8696.87, 9505.56, 8145.35, 5811.16, 6183.95, 6683.91, 8318.86,…
$ small.bags   <dbl> 8603.62, 9408.07, 8042.21, 5677.40, 5986.26, 6556.47, 8196.81,…
$ large.bags   <dbl> 93.25, 97.49, 103.14, 133.76, 197.69, 127.44, 122.05, 562.37, …
$ xlarge.bags  <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
$ type         <fct> conventional, conventional, conventional, conventional, conven…
$ year         <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 20…
$ region       <fct> Albany, Albany, Albany, Albany, Albany, Albany, Albany, Albany…
$ month        <dbl> 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9…
$ week         <dbl> 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37…
avocado %>%
  ggplot(aes(x = averageprice)) +
  geom_histogram() +
  facet_wrap(~ year)

# seems to be less data in 2018 compared to other years 

#table(avocado$year)
avocado %>%
  ggplot(aes(x = averageprice, fill = type)) +
  geom_histogram() +
  facet_wrap(~ month)


#table(avocado$month)
avocado %>%
  ggplot(aes(x = averageprice, fill = type)) +
  geom_histogram() +
  facet_wrap(~ week)


#table(avocado$week)
avocado %>%
  ggplot(aes(x = as.factor(year), y = averageprice, group = year)) +
  geom_boxplot()

avocado %>%
  ggplot(aes(x = as.factor(month), y = averageprice, group = month)) +
  geom_boxplot()

avocado %>%
  ggplot(aes(x = as.factor(type), y = averageprice)) +
  geom_boxplot() +
  coord_flip()

#must greater spread of prices for organic
var(avocado$averageprice)
[1] 0.1621484
mean(avocado$averageprice)
[1] 1.405978
sd(avocado$averageprice)
[1] 0.4026766
#assuming normally distributed averageprice
#approx 68 percent of avocados in our data sold at prices between $1.405978 - $0.4026766 = $1.003302 and $1.405978 + $0.4026766 = $1.808655
mean(avocado$averageprice) - sd(avocado$averageprice)
[1] 1.003302
mean(avocado$averageprice) + sd(avocado$averageprice)
[1] 1.808655
avocado %>%
  ggplot(aes(x = averageprice, fill = type)) +
  geom_histogram()

avocado %>%
  ggplot(aes(x = averageprice, fill = type)) +
  geom_histogram() +
  facet_wrap(~ type)

#more conventional than organic sold but at generally lower price.
avocado %>%
  ggplot(aes(x = region, y = averageprice, color = as.factor(year))) +
  geom_point() +
  coord_flip() +
  facet_wrap(~ year)

# great deal of variability of price around region
avocado %>%
  ggplot(aes(x = region, y = averageprice, color = as.factor(month))) +
  geom_point() +
  coord_flip() +
  facet_wrap(~ month)

# great deal of variability of price around region and month
avocado %>%
  ggplot(aes(x = region, y = averageprice, color = as.factor(year))) +
  geom_point() +
  coord_flip() +
  facet_wrap(~ type)


# and variability around year too
avocado %>%
  ggplot(aes(x = ymd(as.character(avocado$date)), y = averageprice, color = type)) +
  geom_line() +
  facet_wrap(~ type, ncol = 1)

region_table <- table(avocado$region)
round(prop.table(region_table) *100, digits = 1)

             Albany             Atlanta BaltimoreWashington               Boise 
                1.9                 1.9                 1.9                 1.9 
             Boston    BuffaloRochester          California           Charlotte 
                1.9                 1.9                 1.9                 1.9 
            Chicago    CincinnatiDayton            Columbus       DallasFtWorth 
                1.9                 1.9                 1.9                 1.9 
             Denver             Detroit         GrandRapids          GreatLakes 
                1.9                 1.9                 1.9                 1.9 
 HarrisburgScranton HartfordSpringfield             Houston        Indianapolis 
                1.9                 1.9                 1.9                 1.9 
       Jacksonville            LasVegas          LosAngeles          Louisville 
                1.9                 1.9                 1.9                 1.9 
  MiamiFtLauderdale            Midsouth           Nashville    NewOrleansMobile 
                1.9                 1.9                 1.9                 1.9 
            NewYork           Northeast  NorthernNewEngland             Orlando 
                1.9                 1.9                 1.9                 1.9 
       Philadelphia       PhoenixTucson          Pittsburgh              Plains 
                1.9                 1.9                 1.9                 1.9 
           Portland   RaleighGreensboro     RichmondNorfolk             Roanoke 
                1.9                 1.9                 1.9                 1.9 
         Sacramento            SanDiego        SanFrancisco             Seattle 
                1.9                 1.9                 1.9                 1.9 
      SouthCarolina        SouthCentral           Southeast             Spokane 
                1.9                 1.9                 1.9                 1.9 
            StLouis            Syracuse               Tampa             TotalUS 
                1.9                 1.9                 1.9                 1.9 
               West    WestTexNewMexico 
                1.9                 1.8 
type_table <- table(avocado$type)
round(prop.table(type_table) * 100, digits = 1)

conventional      organic 
          50           50 
table(avocado$year)

2015 2016 2017 2018 
5615 5616 5722 1296 
summary(avocado)
       x                 date        averageprice    total.volume     
 Min.   : 0.00   2015-01-04:  108   Min.   :0.440   Min.   :      85  
 1st Qu.:10.00   2015-01-11:  108   1st Qu.:1.100   1st Qu.:   10839  
 Median :24.00   2015-01-18:  108   Median :1.370   Median :  107377  
 Mean   :24.23   2015-01-25:  108   Mean   :1.406   Mean   :  850644  
 3rd Qu.:38.00   2015-02-01:  108   3rd Qu.:1.660   3rd Qu.:  432962  
 Max.   :52.00   2015-02-08:  108   Max.   :3.250   Max.   :62505647  
                 (Other)   :17601                                     
     x4046              x4225              x4770           total.bags      
 Min.   :       0   Min.   :       0   Min.   :      0   Min.   :       0  
 1st Qu.:     854   1st Qu.:    3009   1st Qu.:      0   1st Qu.:    5089  
 Median :    8645   Median :   29061   Median :    185   Median :   39744  
 Mean   :  293008   Mean   :  295155   Mean   :  22840   Mean   :  239639  
 3rd Qu.:  111020   3rd Qu.:  150207   3rd Qu.:   6243   3rd Qu.:  110783  
 Max.   :22743616   Max.   :20470573   Max.   :2546439   Max.   :19373134  
                                                                           
   small.bags         large.bags       xlarge.bags                 type     
 Min.   :       0   Min.   :      0   Min.   :     0.0   conventional:9126  
 1st Qu.:    2849   1st Qu.:    127   1st Qu.:     0.0   organic     :9123  
 Median :   26363   Median :   2648   Median :     0.0                      
 Mean   :  182195   Mean   :  54338   Mean   :  3106.4                      
 3rd Qu.:   83338   3rd Qu.:  22029   3rd Qu.:   132.5                      
 Max.   :13384587   Max.   :5719097   Max.   :551693.7                      
                                                                            
      year                      region          month             week      
 Min.   :2015   Albany             :  338   Min.   : 1.000   Min.   : 1.00  
 1st Qu.:2015   Atlanta            :  338   1st Qu.: 3.000   1st Qu.:11.00  
 Median :2016   BaltimoreWashington:  338   Median : 6.000   Median :25.00  
 Mean   :2016   Boise              :  338   Mean   : 6.177   Mean   :25.24  
 3rd Qu.:2017   Boston             :  338   3rd Qu.: 9.000   3rd Qu.:39.00  
 Max.   :2018   BuffaloRochester   :  338   Max.   :12.000   Max.   :53.00  
                (Other)            :16221                                   
library(psych)
pairs.panels(avocado[c("averageprice", "x4046", "x4225", "x4770", "small.bags", "large.bags", "xlarge.bags", "type", "region", "month", "week", "year")])

summary(avocado)
       x                 date        averageprice    total.volume     
 Min.   : 0.00   2015-01-04:  108   Min.   :0.440   Min.   :      85  
 1st Qu.:10.00   2015-01-11:  108   1st Qu.:1.100   1st Qu.:   10839  
 Median :24.00   2015-01-18:  108   Median :1.370   Median :  107377  
 Mean   :24.23   2015-01-25:  108   Mean   :1.406   Mean   :  850644  
 3rd Qu.:38.00   2015-02-01:  108   3rd Qu.:1.660   3rd Qu.:  432962  
 Max.   :52.00   2015-02-08:  108   Max.   :3.250   Max.   :62505647  
                 (Other)   :17601                                     
     x4046              x4225              x4770           total.bags      
 Min.   :       0   Min.   :       0   Min.   :      0   Min.   :       0  
 1st Qu.:     854   1st Qu.:    3009   1st Qu.:      0   1st Qu.:    5089  
 Median :    8645   Median :   29061   Median :    185   Median :   39744  
 Mean   :  293008   Mean   :  295155   Mean   :  22840   Mean   :  239639  
 3rd Qu.:  111020   3rd Qu.:  150207   3rd Qu.:   6243   3rd Qu.:  110783  
 Max.   :22743616   Max.   :20470573   Max.   :2546439   Max.   :19373134  
                                                                           
   small.bags         large.bags       xlarge.bags                 type     
 Min.   :       0   Min.   :      0   Min.   :     0.0   conventional:9126  
 1st Qu.:    2849   1st Qu.:    127   1st Qu.:     0.0   organic     :9123  
 Median :   26363   Median :   2648   Median :     0.0                      
 Mean   :  182195   Mean   :  54338   Mean   :  3106.4                      
 3rd Qu.:   83338   3rd Qu.:  22029   3rd Qu.:   132.5                      
 Max.   :13384587   Max.   :5719097   Max.   :551693.7                      
                                                                            
      year                      region          month             week      
 Min.   :2015   Albany             :  338   Min.   : 1.000   Min.   : 1.00  
 1st Qu.:2015   Atlanta            :  338   1st Qu.: 3.000   1st Qu.:11.00  
 Median :2016   BaltimoreWashington:  338   Median : 6.000   Median :25.00  
 Mean   :2016   Boise              :  338   Mean   : 6.177   Mean   :25.24  
 3rd Qu.:2017   Boston             :  338   3rd Qu.: 9.000   3rd Qu.:39.00  
 Max.   :2018   BuffaloRochester   :  338   Max.   :12.000   Max.   :53.00  
                (Other)            :16221                                   
# tidy up data
avocado_tidy <- avocado %>%
  select(-c("x", "date", "total.volume", "total.bags"))

glimpse(avocado_tidy)
Observations: 18,249
Variables: 12
$ averageprice <dbl> 1.33, 1.35, 0.93, 1.08, 1.28, 1.26, 0.99, 0.98, 1.02, 1.07, 1.…
$ x4046        <dbl> 1036.74, 674.28, 794.70, 1132.00, 941.48, 1184.27, 1368.92, 70…
$ x4225        <dbl> 54454.85, 44638.81, 109149.67, 71976.41, 43838.39, 48067.99, 7…
$ x4770        <dbl> 48.16, 58.33, 130.50, 72.58, 75.78, 43.61, 93.26, 80.00, 85.34…
$ small.bags   <dbl> 8603.62, 9408.07, 8042.21, 5677.40, 5986.26, 6556.47, 8196.81,…
$ large.bags   <dbl> 93.25, 97.49, 103.14, 133.76, 197.69, 127.44, 122.05, 562.37, …
$ xlarge.bags  <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
$ type         <fct> conventional, conventional, conventional, conventional, conven…
$ year         <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 20…
$ region       <fct> Albany, Albany, Albany, Albany, Albany, Albany, Albany, Albany…
$ month        <dbl> 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9…
$ week         <dbl> 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37…
# x is just reference so no predictive power, taken out date as I have year, week and month. total.volume and total.bags can be derived
# from the other data
# changing type to logical as there are only two types of avocado - didn't bother with this as noticed the model did all of this automatically.
#avocado_tidy$is.organic <- with(avocado_tidy, type=="organic")
# using alias to check is there are any aliased vairables | this result (I assume) means no aliased variables
alias(averageprice ~ ., data = avocado_tidy)
Model :
averageprice ~ x4046 + x4225 + x4770 + small.bags + large.bags + 
    xlarge.bags + type + year + region + month + week
avocado_tidy %>%
  ggplot(aes(x = averageprice, y = type)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

library(caret)
# using K-fold cross validation
# using 10 folds
# first model has everything included!
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)

model <- train(averageprice ~ ., data = avocado_tidy,
               trControl = cv_10_fold,
               method = "lm")
model$pred
model$resample
mean(model$resample$RMSE)
[1] 0.2582568
mean(model$resample$Rsquared)
[1] 0.5887593
summary(model)

Call:
lm(formula = .outcome ~ ., data = dat)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.03539 -0.15729 -0.00504  0.14804  1.51228 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)               -1.061e+02  4.269e+00 -24.849  < 2e-16 ***
x4046                      1.034e-08  5.752e-09   1.799 0.072083 .  
x4225                     -1.084e-08  6.790e-09  -1.597 0.110331    
x4770                     -3.255e-09  4.421e-08  -0.074 0.941308    
small.bags                -1.777e-08  1.206e-08  -1.474 0.140629    
large.bags                -6.559e-08  2.101e-08  -3.121 0.001803 ** 
xlarge.bags                1.365e-06  2.037e-07   6.702 2.12e-11 ***
typeorganic                4.921e-01  4.036e-03 121.909  < 2e-16 ***
year                       5.321e-02  2.117e-03  25.135  < 2e-16 ***
regionAtlanta             -2.228e-01  1.985e-02 -11.224  < 2e-16 ***
regionBaltimoreWashington -2.373e-02  1.987e-02  -1.194 0.232363    
regionBoise               -2.131e-01  1.983e-02 -10.746  < 2e-16 ***
regionBoston              -2.727e-02  1.986e-02  -1.373 0.169619    
regionBuffaloRochester    -4.383e-02  1.983e-02  -2.210 0.027107 *  
regionCalifornia          -1.736e-01  2.028e-02  -8.559  < 2e-16 ***
regionCharlotte            4.526e-02  1.984e-02   2.282 0.022522 *  
regionChicago             -2.472e-03  1.994e-02  -0.124 0.901307    
regionCincinnatiDayton    -3.495e-01  1.985e-02 -17.608  < 2e-16 ***
regionColumbus            -3.090e-01  1.983e-02 -15.578  < 2e-16 ***
regionDallasFtWorth       -4.763e-01  1.987e-02 -23.968  < 2e-16 ***
regionDenver              -3.335e-01  1.995e-02 -16.713  < 2e-16 ***
regionDetroit             -2.906e-01  1.987e-02 -14.624  < 2e-16 ***
regionGrandRapids         -5.871e-02  1.984e-02  -2.960 0.003082 ** 
regionGreatLakes          -2.265e-01  2.047e-02 -11.069  < 2e-16 ***
regionHarrisburgScranton  -4.756e-02  1.983e-02  -2.398 0.016482 *  
regionHartfordSpringfield  2.587e-01  1.984e-02  13.043  < 2e-16 ***
regionHouston             -5.112e-01  1.987e-02 -25.734  < 2e-16 ***
regionIndianapolis        -2.469e-01  1.983e-02 -12.448  < 2e-16 ***
regionJacksonville        -5.003e-02  1.983e-02  -2.522 0.011665 *  
regionLasVegas            -1.785e-01  1.984e-02  -8.997  < 2e-16 ***
regionLosAngeles          -3.555e-01  2.015e-02 -17.644  < 2e-16 ***
regionLouisville          -2.741e-01  1.983e-02 -13.821  < 2e-16 ***
regionMiamiFtLauderdale   -1.326e-01  1.986e-02  -6.678 2.49e-11 ***
regionMidsouth            -1.472e-01  2.003e-02  -7.350 2.06e-13 ***
regionNashville           -3.491e-01  1.983e-02 -17.603  < 2e-16 ***
regionNewOrleansMobile    -2.584e-01  1.984e-02 -13.026  < 2e-16 ***
regionNewYork              1.746e-01  2.002e-02   8.721  < 2e-16 ***
regionNortheast            6.281e-02  2.145e-02   2.929 0.003407 ** 
regionNorthernNewEngland  -8.176e-02  1.984e-02  -4.121 3.79e-05 ***
regionOrlando             -5.500e-02  1.984e-02  -2.772 0.005570 ** 
regionPhiladelphia         7.308e-02  1.984e-02   3.683 0.000231 ***
regionPhoenixTucson       -3.355e-01  1.989e-02 -16.867  < 2e-16 ***
regionPittsburgh          -1.966e-01  1.983e-02  -9.915  < 2e-16 ***
regionPlains              -1.258e-01  1.989e-02  -6.328 2.54e-10 ***
regionPortland            -2.399e-01  1.985e-02 -12.083  < 2e-16 ***
regionRaleighGreensboro   -5.601e-03  1.984e-02  -0.282 0.777695    
regionRichmondNorfolk     -2.697e-01  1.983e-02 -13.600  < 2e-16 ***
regionRoanoke             -3.132e-01  1.983e-02 -15.791  < 2e-16 ***
regionSacramento           6.037e-02  1.983e-02   3.044 0.002337 ** 
regionSanDiego            -1.623e-01  1.984e-02  -8.183 2.96e-16 ***
regionSanFrancisco         2.444e-01  1.985e-02  12.312  < 2e-16 ***
regionSeattle             -1.148e-01  1.985e-02  -5.784 7.43e-09 ***
regionSouthCarolina       -1.579e-01  1.984e-02  -7.963 1.78e-15 ***
regionSouthCentral        -4.613e-01  2.053e-02 -22.466  < 2e-16 ***
regionSoutheast           -1.614e-01  2.038e-02  -7.920 2.51e-15 ***
regionSpokane             -1.154e-01  1.983e-02  -5.819 6.03e-09 ***
regionStLouis             -1.309e-01  1.983e-02  -6.600 4.23e-11 ***
regionSyracuse            -4.080e-02  1.983e-02  -2.058 0.039646 *  
regionTampa               -1.521e-01  1.984e-02  -7.664 1.89e-14 ***
regionTotalUS             -1.879e-01  2.446e-02  -7.685 1.61e-14 ***
regionWest                -2.554e-01  2.067e-02 -12.355  < 2e-16 ***
regionWestTexNewMexico    -2.962e-01  1.992e-02 -14.873  < 2e-16 ***
month                     -2.037e-02  6.578e-03  -3.097 0.001956 ** 
week                       9.460e-03  1.500e-03   6.307 2.90e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2578 on 18185 degrees of freedom
Multiple R-squared:  0.5916,    Adjusted R-squared:  0.5902 
F-statistic: 418.1 on 63 and 18185 DF,  p-value: < 2.2e-16
model <- lm(averageprice ~ ., data = avocado_tidy)
par(mfrow = c(2, 2))
plot(model)

looking for a lower error (RMSE) and a higher R squared value

cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# much simpler model
model1 <- train(averageprice ~ type + region, 
                data = avocado_tidy,
                trControl = cv_10_fold,
                method = "lm")

mean(model1$resample$RMSE)
[1] 0.2716248
mean(model1$resample$Rsquared)
[1] 0.5450222

wrong direction

cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# even simpler
model2 <- train(averageprice ~ type, 
                data = avocado_tidy,
                trControl = cv_10_fold,
                method = "lm")

mean(model2$resample$RMSE)
[1] 0.3172512
mean(model2$resample$Rsquared)
[1] 0.3793824

even worse

cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)

model3 <- train(averageprice ~ type + month, 
                data = avocado_tidy,
                trControl = cv_10_fold,
                method = "lm")

mean(model3$resample$RMSE)
[1] 0.3104094
mean(model3$resample$Rsquared)
[1] 0.4058502

error reduced slightly and R squared increased

cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)

model4 <- train(averageprice ~ type + year + region + week + month, 
                data = avocado_tidy,
                trControl = cv_10_fold,
                method = "lm")

mean(model4$resample$RMSE)
[1] 0.2585822
mean(model4$resample$Rsquared)
[1] 0.5873909

error reduced slightly further and R squared increased quite a bit

model4$resample
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)

model5 <- train(log(averageprice) ~ type + year + region + week + month, 
                data = avocado_tidy,
                trControl = cv_10_fold,
                method = "lm")

mean(model5$resample$RMSE)
[1] 0.1828585
mean(model5$resample$Rsquared)
[1] 0.6016292
model5$resample
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)

model6 <- train(log(averageprice) ~ type + year + region + week + month + region:type, 
                data = avocado_tidy,
                trControl = cv_10_fold,
                method = "lm")

mean(model6$resample$RMSE)
[1] 0.1730975
mean(model6$resample$Rsquared)
[1] 0.6432118
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# adding an interaction
model7 <- train(log(averageprice) ~ type + year + region + week + month + region:type + region:month, 
                data = avocado_tidy,
                trControl = cv_10_fold,
                method = "lm")

mean(model7$resample$RMSE)
[1] 0.1713983
mean(model7$resample$Rsquared)
[1] 0.6501079
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# adding a further interaction
model8 <- train(log(averageprice) ~ type + year + region + week + month + region:type + region:week, 
                data = avocado_tidy,
                trControl = cv_10_fold,
                method = "lm")

mean(model8$resample$RMSE)
[1] 0.1714282
mean(model8$resample$Rsquared)
[1] 0.6498005
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# adding a further interaction
model9 <- train(log(averageprice) ~ type + year + region + week + month + large.bags + region:type + region:week, 
                data = avocado_tidy,
                trControl = cv_10_fold,
                method = "lm")

mean(model9$resample$RMSE)
[1] 0.1711208
mean(model9$resample$Rsquared)
[1] 0.6511975
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# adding a further interaction
model10 <- train(log(averageprice) ~ type + year + region + week + month + large.bags + x4046 + region:type + region:week, 
                data = avocado_tidy,
                trControl = cv_10_fold,
                method = "lm")

mean(model10$resample$RMSE)
[1] 0.1706929
mean(model10$resample$Rsquared)
[1] 0.6529101
model_best <- lm(log(averageprice) ~ type + year + region + week + month + large.bags + region:type + region:week, data = avocado_tidy)
broom::glance(model_best)
LS0tCnRpdGxlOiAiQXZvY2FkbyBNb2RlbCBCdWlsZGluZyBIb21ld29yayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIzEgTVZQCldl4oCZdmUgbG9va2VkIGF0IGEgZmV3IGRpZmZlcmVudCB3YXlzIGluIHdoaWNoIHdlIGNhbiBidWlsZCBtb2RlbHMgdGhpcyB3ZWVrLCBpbmNsdWRpbmcgaG93IHRvIHByZXBhcmUgdGhlbSBwcm9wZXJseS4gVGhpcyB3ZWVrZW5kIHdl4oCZbGwgYnVpbGQgYSBtdWx0aXBsZSBsaW5lYXIgcmVncmVzc2lvbiBtb2RlbCBvbiBhIGRhdGFzZXQgd2hpY2ggd2lsbCBuZWVkIHNvbWUgcHJlcGFyYXRpb24uIFRoZSBkYXRhIGhhcyBjb21lIGZyb20gS2FnZ2xlIGFuZCBjYW4gYmUgZm91bmQgaW4gdGhlIGRhdGEgZm9sZGVyLgoKV2Ugd2FudCB0byBtb2RlbCBhdm9jYWRvIHNhbGVzLiBZb3XigJlsbCBuZWVkIHRvIGlkZW50aWZ5IHRoZSB0YXJnZXQgdmFyaWFibGUgYW5kIHVzZSB0aGUgdG9vbHMgd2XigJl2ZSB3b3JrZWQgd2l0aCB0aGlzIHdlZWsgaW4gb3JkZXIgdG8gcHJlcGFyZSB5b3VyIGRhdGFzZXQgYW5kIGZpbmQgYXBwcm9wcmlhdGUgcHJlZGljdG9ycy4gT25jZSB5b3XigJl2ZSBidWlsdCB5b3VyIG1vZGVsIHVzZSB0aGUgdmFsaWRhdGlvbiB0ZWNobmlxdWVzIGRpc2N1c3NlZCBvbiBXZWRuZXNkYXkgdG8gZXZhbHVhdGUgaXQuCgojMiBFeHRlbnNpb25zCkJ1aWxkIGEgZGVjaXNpb24gdHJlZSB0byBtb2RlbCB0aGUgbGlrZWxpaG9vZCBvZiBhIHNhbGUgYmVpbmcgb2YgYW4gb3JnYW5pYyBhdm9jYWRvLgpVc2Ugay1tZWFucyBjbHVzdGVyaW5nIHRvIGludmVzdGlnYXRlIHBvdGVudGlhbCByZWxhdGlvbnNoaXBzIGJldHdlZW4gdGhlIHllYXIgYW5kIHRoZSBhdmVyYWdlIGF2b2NhZG8gcHJpY2UuCgoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQojbG9hZGluZyBpbiBkYXRhCmF2b2NhZG8gPC0gcmVhZC5jc3YoImRhdGEvYXZvY2Fkby5jc3YiKQpgYGAKCgpgYGB7cn0KIyBpbnZlc3RpZ2F0ZSBzdHJ1Y3R1cmUgYW5kIHN1bW1hcnkgb2YgZGF0YQpzdHIoYXZvY2FkbykKc3VtbWFyeShhdm9jYWRvKQpgYGAKCmBgYHtyfQojIGNoZWNrIGZvciBtaXNzaW5nIHZhbHVlcwphcHBseShhdm9jYWRvLCAyLCBmdW5jdGlvbih4KSBhbnkoaXMubmEoeCkgfCBpcy5pbmZpbml0ZSh4KSB8IGlzLm51bGwoeCkpKQpgYGAKCgpgYGB7cn0KIyBnZXQgcmlkIG9mIHNwYWNlcyBpbiBjb2x1bW4gbmFtZXMKbmFtZXMoYXZvY2FkbykgPC0gbWFrZS5uYW1lcyhuYW1lcyhhdm9jYWRvKSkKYGBgCgpgYGB7cn0KIyBtYWtlIGFsbCBjb2x1bW4gbmFtZXMgbG93ZXIgY2FzZQpmb3IoIGkgaW4gY29sbmFtZXMoYXZvY2FkbykpIHsKICBjb2xuYW1lcyhhdm9jYWRvKVt3aGljaChjb2xuYW1lcyhhdm9jYWRvKSA9PSBpKV0gPSB0b2xvd2VyKGkpCn0KCmF2b2NhZG8KYGBgCgpgYGB7cn0KbGlicmFyeShsdWJyaWRhdGUpCmF2b2NhZG8keWVhciA8LSB5ZWFyKHltZChhcy5jaGFyYWN0ZXIoYXZvY2FkbyRkYXRlKSkpCmBgYAoKYGBge3J9CmF2b2NhZG8kbW9udGggPC0gbW9udGgoeW1kKGFzLmNoYXJhY3Rlcihhdm9jYWRvJGRhdGUpKSkKYGBgCgpgYGB7cn0KYXZvY2FkbyR3ZWVrIDwtIHdlZWsoeW1kKGFzLmNoYXJhY3Rlcihhdm9jYWRvJGRhdGUpKSkKYGBgCgoKYGBge3J9CmdsaW1wc2UoYXZvY2FkbykKYGBgCgoKYGBge3J9CmF2b2NhZG8gJT4lCiAgZ2dwbG90KGFlcyh4ID0gYXZlcmFnZXByaWNlKSkgKwogIGdlb21faGlzdG9ncmFtKCkgKwogIGZhY2V0X3dyYXAofiB5ZWFyKQojIHNlZW1zIHRvIGJlIGxlc3MgZGF0YSBpbiAyMDE4IGNvbXBhcmVkIHRvIG90aGVyIHllYXJzIAoKI3RhYmxlKGF2b2NhZG8keWVhcikKYGBgCgpgYGB7cn0KYXZvY2FkbyAlPiUKICBnZ3Bsb3QoYWVzKHggPSBhdmVyYWdlcHJpY2UsIGZpbGwgPSB0eXBlKSkgKwogIGdlb21faGlzdG9ncmFtKCkgKwogIGZhY2V0X3dyYXAofiBtb250aCkKCiN0YWJsZShhdm9jYWRvJG1vbnRoKQpgYGAKCmBgYHtyfQphdm9jYWRvICU+JQogIGdncGxvdChhZXMoeCA9IGF2ZXJhZ2VwcmljZSwgZmlsbCA9IHR5cGUpKSArCiAgZ2VvbV9oaXN0b2dyYW0oKSArCiAgZmFjZXRfd3JhcCh+IHdlZWspCgojdGFibGUoYXZvY2FkbyR3ZWVrKQpgYGAKCmBgYHtyfQphdm9jYWRvICU+JQogIGdncGxvdChhZXMoeCA9IGFzLmZhY3Rvcih5ZWFyKSwgeSA9IGF2ZXJhZ2VwcmljZSwgZ3JvdXAgPSB5ZWFyKSkgKwogIGdlb21fYm94cGxvdCgpCmBgYAoKYGBge3J9CmF2b2NhZG8gJT4lCiAgZ2dwbG90KGFlcyh4ID0gYXMuZmFjdG9yKG1vbnRoKSwgeSA9IGF2ZXJhZ2VwcmljZSwgZ3JvdXAgPSBtb250aCkpICsKICBnZW9tX2JveHBsb3QoKQpgYGAKCmBgYHtyfQphdm9jYWRvICU+JQogIGdncGxvdChhZXMoeCA9IGFzLmZhY3Rvcih0eXBlKSwgeSA9IGF2ZXJhZ2VwcmljZSkpICsKICBnZW9tX2JveHBsb3QoKSArCiAgY29vcmRfZmxpcCgpCiNtdXN0IGdyZWF0ZXIgc3ByZWFkIG9mIHByaWNlcyBmb3Igb3JnYW5pYwpgYGAKCmBgYHtyfQp2YXIoYXZvY2FkbyRhdmVyYWdlcHJpY2UpCm1lYW4oYXZvY2FkbyRhdmVyYWdlcHJpY2UpCnNkKGF2b2NhZG8kYXZlcmFnZXByaWNlKQoKI2Fzc3VtaW5nIG5vcm1hbGx5IGRpc3RyaWJ1dGVkIGF2ZXJhZ2VwcmljZQojYXBwcm94IDY4IHBlcmNlbnQgb2YgYXZvY2Fkb3MgaW4gb3VyIGRhdGEgc29sZCBhdCBwcmljZXMgYmV0d2VlbiAkMS40MDU5NzggLSAkMC40MDI2NzY2ID0gJDEuMDAzMzAyIGFuZCAkMS40MDU5NzggKyAkMC40MDI2NzY2ID0gJDEuODA4NjU1CmBgYApgYGB7cn0KbWVhbihhdm9jYWRvJGF2ZXJhZ2VwcmljZSkgLSBzZChhdm9jYWRvJGF2ZXJhZ2VwcmljZSkKbWVhbihhdm9jYWRvJGF2ZXJhZ2VwcmljZSkgKyBzZChhdm9jYWRvJGF2ZXJhZ2VwcmljZSkKYGBgCgpgYGB7cn0KYXZvY2FkbyAlPiUKICBnZ3Bsb3QoYWVzKHggPSBhdmVyYWdlcHJpY2UsIGZpbGwgPSB0eXBlKSkgKwogIGdlb21faGlzdG9ncmFtKCkKYGBgCgoKYGBge3J9CmF2b2NhZG8gJT4lCiAgZ2dwbG90KGFlcyh4ID0gYXZlcmFnZXByaWNlLCBmaWxsID0gdHlwZSkpICsKICBnZW9tX2hpc3RvZ3JhbSgpICsKICBmYWNldF93cmFwKH4gdHlwZSkKI21vcmUgY29udmVudGlvbmFsIHRoYW4gb3JnYW5pYyBzb2xkIGJ1dCBhdCBnZW5lcmFsbHkgbG93ZXIgcHJpY2UuCmBgYAoKCgpgYGB7cn0KYXZvY2FkbyAlPiUKICBnZ3Bsb3QoYWVzKHggPSByZWdpb24sIHkgPSBhdmVyYWdlcHJpY2UsIGNvbG9yID0gYXMuZmFjdG9yKHllYXIpKSkgKwogIGdlb21fcG9pbnQoKSArCiAgY29vcmRfZmxpcCgpICsKICBmYWNldF93cmFwKH4geWVhcikKIyBncmVhdCBkZWFsIG9mIHZhcmlhYmlsaXR5IG9mIHByaWNlIGFyb3VuZCByZWdpb24KYGBgCgpgYGB7cn0KYXZvY2FkbyAlPiUKICBnZ3Bsb3QoYWVzKHggPSByZWdpb24sIHkgPSBhdmVyYWdlcHJpY2UsIGNvbG9yID0gYXMuZmFjdG9yKG1vbnRoKSkpICsKICBnZW9tX3BvaW50KCkgKwogIGNvb3JkX2ZsaXAoKSArCiAgZmFjZXRfd3JhcCh+IG1vbnRoKQojIGdyZWF0IGRlYWwgb2YgdmFyaWFiaWxpdHkgb2YgcHJpY2UgYXJvdW5kIHJlZ2lvbiBhbmQgbW9udGgKYGBgCgoKYGBge3J9CmF2b2NhZG8gJT4lCiAgZ2dwbG90KGFlcyh4ID0gcmVnaW9uLCB5ID0gYXZlcmFnZXByaWNlLCBjb2xvciA9IGFzLmZhY3Rvcih5ZWFyKSkpICsKICBnZW9tX3BvaW50KCkgKwogIGNvb3JkX2ZsaXAoKSArCiAgZmFjZXRfd3JhcCh+IHR5cGUpCgojIGFuZCB2YXJpYWJpbGl0eSBhcm91bmQgeWVhciB0b28KYGBgCgpgYGB7cn0KYXZvY2FkbyAlPiUKICBnZ3Bsb3QoYWVzKHggPSB5bWQoYXMuY2hhcmFjdGVyKGF2b2NhZG8kZGF0ZSkpLCB5ID0gYXZlcmFnZXByaWNlLCBjb2xvciA9IHR5cGUpKSArCiAgZ2VvbV9saW5lKCkgKwogIGZhY2V0X3dyYXAofiB0eXBlLCBuY29sID0gMSkKCmBgYAoKCmBgYHtyfQpyZWdpb25fdGFibGUgPC0gdGFibGUoYXZvY2FkbyRyZWdpb24pCnJvdW5kKHByb3AudGFibGUocmVnaW9uX3RhYmxlKSAqMTAwLCBkaWdpdHMgPSAxKQpgYGAKCmBgYHtyfQp0eXBlX3RhYmxlIDwtIHRhYmxlKGF2b2NhZG8kdHlwZSkKcm91bmQocHJvcC50YWJsZSh0eXBlX3RhYmxlKSAqIDEwMCwgZGlnaXRzID0gMSkKYGBgCgpgYGB7cn0KdGFibGUoYXZvY2FkbyR5ZWFyKQpgYGAKCgpgYGB7cn0Kc3VtbWFyeShhdm9jYWRvKQpgYGAKCmBgYHtyfQpsaWJyYXJ5KHBzeWNoKQpgYGAKYGBge3J9CnBhaXJzLnBhbmVscyhhdm9jYWRvW2MoImF2ZXJhZ2VwcmljZSIsICJ4NDA0NiIsICJ4NDIyNSIsICJ4NDc3MCIsICJzbWFsbC5iYWdzIiwgImxhcmdlLmJhZ3MiLCAieGxhcmdlLmJhZ3MiLCAidHlwZSIsICJyZWdpb24iLCAibW9udGgiLCAid2VlayIsICJ5ZWFyIildKQpgYGAKYGBge3J9CnN1bW1hcnkoYXZvY2FkbykKYGBgCgpgYGB7cn0KIyB0aWR5IHVwIGRhdGEKYXZvY2Fkb190aWR5IDwtIGF2b2NhZG8gJT4lCiAgc2VsZWN0KC1jKCJ4IiwgImRhdGUiLCAidG90YWwudm9sdW1lIiwgInRvdGFsLmJhZ3MiKSkKCmdsaW1wc2UoYXZvY2Fkb190aWR5KQojIHggaXMganVzdCByZWZlcmVuY2Ugc28gbm8gcHJlZGljdGl2ZSBwb3dlciwgdGFrZW4gb3V0IGRhdGUgYXMgSSBoYXZlIHllYXIsIHdlZWsgYW5kIG1vbnRoLiB0b3RhbC52b2x1bWUgYW5kIHRvdGFsLmJhZ3MgY2FuIGJlIGRlcml2ZWQKIyBmcm9tIHRoZSBvdGhlciBkYXRhCmBgYAoKYGBge3J9CiMgY2hhbmdpbmcgdHlwZSB0byBsb2dpY2FsIGFzIHRoZXJlIGFyZSBvbmx5IHR3byB0eXBlcyBvZiBhdm9jYWRvIC0gZGlkbid0IGJvdGhlciB3aXRoIHRoaXMgYXMgbm90aWNlZCB0aGUgbW9kZWwgZGlkIGFsbCBvZiB0aGlzIGF1dG9tYXRpY2FsbHkuCiNhdm9jYWRvX3RpZHkkaXMub3JnYW5pYyA8LSB3aXRoKGF2b2NhZG9fdGlkeSwgdHlwZT09Im9yZ2FuaWMiKQpgYGAKCmBgYHtyfQojIHVzaW5nIGFsaWFzIHRvIGNoZWNrIGlzIHRoZXJlIGFyZSBhbnkgYWxpYXNlZCB2YWlyYWJsZXMgfCB0aGlzIHJlc3VsdCAoSSBhc3N1bWUpIG1lYW5zIG5vIGFsaWFzZWQgdmFyaWFibGVzCmFsaWFzKGF2ZXJhZ2VwcmljZSB+IC4sIGRhdGEgPSBhdm9jYWRvX3RpZHkpCmBgYAoKYGBge3J9CmF2b2NhZG9fdGlkeSAlPiUKICBnZ3Bsb3QoYWVzKHggPSBhdmVyYWdlcHJpY2UsIHkgPSB0eXBlKSkgKwogIGdlb21fcG9pbnQoKSArCiAgZ2VvbV9zbW9vdGgobWV0aG9kID0gImxtIiwgc2UgPSBGQUxTRSkKYGBgCgoKYGBge3J9CmxpYnJhcnkoY2FyZXQpCmBgYAoKYGBge3J9CiMgdXNpbmcgSy1mb2xkIGNyb3NzIHZhbGlkYXRpb24KIyB1c2luZyAxMCBmb2xkcwojIGZpcnN0IG1vZGVsIGhhcyBldmVyeXRoaW5nIGluY2x1ZGVkIQpjdl8xMF9mb2xkIDwtIHRyYWluQ29udHJvbChtZXRob2QgPSAiY3YiLCBudW1iZXIgPSAxMCwgc2F2ZVByZWRpY3Rpb25zID0gVFJVRSkKCm1vZGVsIDwtIHRyYWluKGF2ZXJhZ2VwcmljZSB+IC4sIGRhdGEgPSBhdm9jYWRvX3RpZHksCiAgICAgICAgICAgICAgIHRyQ29udHJvbCA9IGN2XzEwX2ZvbGQsCiAgICAgICAgICAgICAgIG1ldGhvZCA9ICJsbSIpCmBgYAoKYGBge3J9Cm1vZGVsJHByZWQKYGBgCgpgYGB7cn0KbW9kZWwkcmVzYW1wbGUKYGBgCgpgYGB7cn0KbWVhbihtb2RlbCRyZXNhbXBsZSRSTVNFKQpgYGAKCmBgYHtyfQptZWFuKG1vZGVsJHJlc2FtcGxlJFJzcXVhcmVkKQpgYGAKCmBgYHtyfQpzdW1tYXJ5KG1vZGVsKQpgYGAKYGBge3J9Cm1vZGVsIDwtIGxtKGF2ZXJhZ2VwcmljZSB+IC4sIGRhdGEgPSBhdm9jYWRvX3RpZHkpCnBhcihtZnJvdyA9IGMoMiwgMikpCnBsb3QobW9kZWwpCmBgYAoKCmxvb2tpbmcgZm9yIGEgbG93ZXIgZXJyb3IgKFJNU0UpIGFuZCBhIGhpZ2hlciBSIHNxdWFyZWQgdmFsdWUKYGBge3J9CmN2XzEwX2ZvbGQgPC0gdHJhaW5Db250cm9sKG1ldGhvZCA9ICJjdiIsIG51bWJlciA9IDEwLCBzYXZlUHJlZGljdGlvbnMgPSBUUlVFKQojIG11Y2ggc2ltcGxlciBtb2RlbAptb2RlbDEgPC0gdHJhaW4oYXZlcmFnZXByaWNlIH4gdHlwZSArIHJlZ2lvbiwgCiAgICAgICAgICAgICAgICBkYXRhID0gYXZvY2Fkb190aWR5LAogICAgICAgICAgICAgICAgdHJDb250cm9sID0gY3ZfMTBfZm9sZCwKICAgICAgICAgICAgICAgIG1ldGhvZCA9ICJsbSIpCgptZWFuKG1vZGVsMSRyZXNhbXBsZSRSTVNFKQptZWFuKG1vZGVsMSRyZXNhbXBsZSRSc3F1YXJlZCkKYGBgCndyb25nIGRpcmVjdGlvbgoKCgpgYGB7cn0KY3ZfMTBfZm9sZCA8LSB0cmFpbkNvbnRyb2wobWV0aG9kID0gImN2IiwgbnVtYmVyID0gMTAsIHNhdmVQcmVkaWN0aW9ucyA9IFRSVUUpCiMgZXZlbiBzaW1wbGVyCm1vZGVsMiA8LSB0cmFpbihhdmVyYWdlcHJpY2UgfiB0eXBlLCAKICAgICAgICAgICAgICAgIGRhdGEgPSBhdm9jYWRvX3RpZHksCiAgICAgICAgICAgICAgICB0ckNvbnRyb2wgPSBjdl8xMF9mb2xkLAogICAgICAgICAgICAgICAgbWV0aG9kID0gImxtIikKCm1lYW4obW9kZWwyJHJlc2FtcGxlJFJNU0UpCm1lYW4obW9kZWwyJHJlc2FtcGxlJFJzcXVhcmVkKQpgYGAKZXZlbiB3b3JzZQoKYGBge3J9CmN2XzEwX2ZvbGQgPC0gdHJhaW5Db250cm9sKG1ldGhvZCA9ICJjdiIsIG51bWJlciA9IDEwLCBzYXZlUHJlZGljdGlvbnMgPSBUUlVFKQoKbW9kZWwzIDwtIHRyYWluKGF2ZXJhZ2VwcmljZSB+IHR5cGUgKyBtb250aCwgCiAgICAgICAgICAgICAgICBkYXRhID0gYXZvY2Fkb190aWR5LAogICAgICAgICAgICAgICAgdHJDb250cm9sID0gY3ZfMTBfZm9sZCwKICAgICAgICAgICAgICAgIG1ldGhvZCA9ICJsbSIpCgptZWFuKG1vZGVsMyRyZXNhbXBsZSRSTVNFKQptZWFuKG1vZGVsMyRyZXNhbXBsZSRSc3F1YXJlZCkKYGBgCmVycm9yIHJlZHVjZWQgc2xpZ2h0bHkgYW5kIFIgc3F1YXJlZCBpbmNyZWFzZWQKCmBgYHtyfQpjdl8xMF9mb2xkIDwtIHRyYWluQ29udHJvbChtZXRob2QgPSAiY3YiLCBudW1iZXIgPSAxMCwgc2F2ZVByZWRpY3Rpb25zID0gVFJVRSkKCm1vZGVsNCA8LSB0cmFpbihhdmVyYWdlcHJpY2UgfiB0eXBlICsgeWVhciArIHJlZ2lvbiArIHdlZWsgKyBtb250aCwgCiAgICAgICAgICAgICAgICBkYXRhID0gYXZvY2Fkb190aWR5LAogICAgICAgICAgICAgICAgdHJDb250cm9sID0gY3ZfMTBfZm9sZCwKICAgICAgICAgICAgICAgIG1ldGhvZCA9ICJsbSIpCgptZWFuKG1vZGVsNCRyZXNhbXBsZSRSTVNFKQptZWFuKG1vZGVsNCRyZXNhbXBsZSRSc3F1YXJlZCkKYGBgCmVycm9yIHJlZHVjZWQgc2xpZ2h0bHkgZnVydGhlciBhbmQgUiBzcXVhcmVkIGluY3JlYXNlZCBxdWl0ZSBhIGJpdApgYGB7cn0KbW9kZWw0JHJlc2FtcGxlCmBgYAoKYGBge3J9CmN2XzEwX2ZvbGQgPC0gdHJhaW5Db250cm9sKG1ldGhvZCA9ICJjdiIsIG51bWJlciA9IDEwLCBzYXZlUHJlZGljdGlvbnMgPSBUUlVFKQoKbW9kZWw1IDwtIHRyYWluKGxvZyhhdmVyYWdlcHJpY2UpIH4gdHlwZSArIHllYXIgKyByZWdpb24gKyB3ZWVrICsgbW9udGgsIAogICAgICAgICAgICAgICAgZGF0YSA9IGF2b2NhZG9fdGlkeSwKICAgICAgICAgICAgICAgIHRyQ29udHJvbCA9IGN2XzEwX2ZvbGQsCiAgICAgICAgICAgICAgICBtZXRob2QgPSAibG0iKQoKbWVhbihtb2RlbDUkcmVzYW1wbGUkUk1TRSkKbWVhbihtb2RlbDUkcmVzYW1wbGUkUnNxdWFyZWQpCmBgYApgYGB7cn0KbW9kZWw1JHJlc2FtcGxlCmBgYAoKYGBge3J9CmN2XzEwX2ZvbGQgPC0gdHJhaW5Db250cm9sKG1ldGhvZCA9ICJjdiIsIG51bWJlciA9IDEwLCBzYXZlUHJlZGljdGlvbnMgPSBUUlVFKQoKbW9kZWw2IDwtIHRyYWluKGxvZyhhdmVyYWdlcHJpY2UpIH4gdHlwZSArIHllYXIgKyByZWdpb24gKyB3ZWVrICsgbW9udGggKyByZWdpb246dHlwZSwgCiAgICAgICAgICAgICAgICBkYXRhID0gYXZvY2Fkb190aWR5LAogICAgICAgICAgICAgICAgdHJDb250cm9sID0gY3ZfMTBfZm9sZCwKICAgICAgICAgICAgICAgIG1ldGhvZCA9ICJsbSIpCgptZWFuKG1vZGVsNiRyZXNhbXBsZSRSTVNFKQptZWFuKG1vZGVsNiRyZXNhbXBsZSRSc3F1YXJlZCkKYGBgCgpgYGB7cn0KY3ZfMTBfZm9sZCA8LSB0cmFpbkNvbnRyb2wobWV0aG9kID0gImN2IiwgbnVtYmVyID0gMTAsIHNhdmVQcmVkaWN0aW9ucyA9IFRSVUUpCiMgYWRkaW5nIGFuIGludGVyYWN0aW9uCm1vZGVsNyA8LSB0cmFpbihsb2coYXZlcmFnZXByaWNlKSB+IHR5cGUgKyB5ZWFyICsgcmVnaW9uICsgd2VlayArIG1vbnRoICsgcmVnaW9uOnR5cGUgKyByZWdpb246bW9udGgsIAogICAgICAgICAgICAgICAgZGF0YSA9IGF2b2NhZG9fdGlkeSwKICAgICAgICAgICAgICAgIHRyQ29udHJvbCA9IGN2XzEwX2ZvbGQsCiAgICAgICAgICAgICAgICBtZXRob2QgPSAibG0iKQoKbWVhbihtb2RlbDckcmVzYW1wbGUkUk1TRSkKbWVhbihtb2RlbDckcmVzYW1wbGUkUnNxdWFyZWQpCmBgYAoKYGBge3J9CmN2XzEwX2ZvbGQgPC0gdHJhaW5Db250cm9sKG1ldGhvZCA9ICJjdiIsIG51bWJlciA9IDEwLCBzYXZlUHJlZGljdGlvbnMgPSBUUlVFKQojIGFkZGluZyBhIGZ1cnRoZXIgaW50ZXJhY3Rpb24KbW9kZWw4IDwtIHRyYWluKGxvZyhhdmVyYWdlcHJpY2UpIH4gdHlwZSArIHllYXIgKyByZWdpb24gKyB3ZWVrICsgbW9udGggKyByZWdpb246dHlwZSArIHJlZ2lvbjp3ZWVrLCAKICAgICAgICAgICAgICAgIGRhdGEgPSBhdm9jYWRvX3RpZHksCiAgICAgICAgICAgICAgICB0ckNvbnRyb2wgPSBjdl8xMF9mb2xkLAogICAgICAgICAgICAgICAgbWV0aG9kID0gImxtIikKCm1lYW4obW9kZWw4JHJlc2FtcGxlJFJNU0UpCm1lYW4obW9kZWw4JHJlc2FtcGxlJFJzcXVhcmVkKQpgYGAKCmBgYHtyfQpjdl8xMF9mb2xkIDwtIHRyYWluQ29udHJvbChtZXRob2QgPSAiY3YiLCBudW1iZXIgPSAxMCwgc2F2ZVByZWRpY3Rpb25zID0gVFJVRSkKIyBhZGRpbmcgYSBmdXJ0aGVyIGludGVyYWN0aW9uCm1vZGVsOSA8LSB0cmFpbihsb2coYXZlcmFnZXByaWNlKSB+IHR5cGUgKyB5ZWFyICsgcmVnaW9uICsgd2VlayArIG1vbnRoICsgbGFyZ2UuYmFncyArIHJlZ2lvbjp0eXBlICsgcmVnaW9uOndlZWssIAogICAgICAgICAgICAgICAgZGF0YSA9IGF2b2NhZG9fdGlkeSwKICAgICAgICAgICAgICAgIHRyQ29udHJvbCA9IGN2XzEwX2ZvbGQsCiAgICAgICAgICAgICAgICBtZXRob2QgPSAibG0iKQoKbWVhbihtb2RlbDkkcmVzYW1wbGUkUk1TRSkKbWVhbihtb2RlbDkkcmVzYW1wbGUkUnNxdWFyZWQpCmBgYAoKYGBge3J9CmN2XzEwX2ZvbGQgPC0gdHJhaW5Db250cm9sKG1ldGhvZCA9ICJjdiIsIG51bWJlciA9IDEwLCBzYXZlUHJlZGljdGlvbnMgPSBUUlVFKQojIGFkZGluZyBhIGZ1cnRoZXIgaW50ZXJhY3Rpb24KbW9kZWwxMCA8LSB0cmFpbihsb2coYXZlcmFnZXByaWNlKSB+IHR5cGUgKyB5ZWFyICsgcmVnaW9uICsgd2VlayArIG1vbnRoICsgbGFyZ2UuYmFncyArIHg0MDQ2ICsgcmVnaW9uOnR5cGUgKyByZWdpb246d2VlaywgCiAgICAgICAgICAgICAgICBkYXRhID0gYXZvY2Fkb190aWR5LAogICAgICAgICAgICAgICAgdHJDb250cm9sID0gY3ZfMTBfZm9sZCwKICAgICAgICAgICAgICAgIG1ldGhvZCA9ICJsbSIpCgptZWFuKG1vZGVsMTAkcmVzYW1wbGUkUk1TRSkKbWVhbihtb2RlbDEwJHJlc2FtcGxlJFJzcXVhcmVkKQpgYGAKCgoKYGBge3J9Cm1vZGVsX2Jlc3QgPC0gbG0obG9nKGF2ZXJhZ2VwcmljZSkgfiB0eXBlICsgeWVhciArIHJlZ2lvbiArIHdlZWsgKyBtb250aCArIGxhcmdlLmJhZ3MgKyByZWdpb246dHlwZSArIHJlZ2lvbjp3ZWVrLCBkYXRhID0gYXZvY2Fkb190aWR5KQpgYGAKCmBgYHtyfQpicm9vbTo6Z2xhbmNlKG1vZGVsX2Jlc3QpCmBgYAoK